Carregando Pacotes

library("dplyr", lib.loc="~/R/win-library/3.6")
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("ggplot2", lib.loc="~/R/win-library/3.6")
library("RgoogleMaps", lib.loc="~/R/win-library/3.6")
## Warning: package 'RgoogleMaps' was built under R version 3.6.3
library("raster", lib.loc="~/R/win-library/3.6")
## Warning: package 'raster' was built under R version 3.6.3
## Loading required package: sp
## 
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
## 
##     select

Upload do DB

# Load the raw crime records from the local CSV export into a data frame.
crime <- read.csv(
  file = "C:/Users/felip/Desktop/Cursos/Kaggle/bostonCrimes_kgl/crime.csv"
)

Primeiras Linhas do DB

head(crime)
##   INCIDENT_NUMBER OFFENSE_CODE   OFFENSE_CODE_GROUP
## 1      I192082859          724           Auto Theft
## 2      I192082751          724           Auto Theft
## 3      I192082680          727           Auto Theft
## 4      I192082577          724           Auto Theft
## 5      I192079582          727           Auto Theft
## 6      I192078648         3114 Investigate Property
##                  OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING
## 1                         AUTO THEFT      E18            519         
## 2                         AUTO THEFT      E18            493         
## 3 AUTO THEFT - LEASED/RENTED VEHICLE      D14            794         
## 4                         AUTO THEFT       D4            130         
## 5 AUTO THEFT - LEASED/RENTED VEHICLE      A15             47         
## 6               INVESTIGATE PROPERTY       B3            427         
##      OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR   UCR_PART           STREET
## 1 2019-10-13 09:28:24 2019    10      Sunday    9   Part One       LINCOLN ST
## 2 2019-10-12 20:11:26 2019    10    Saturday   20   Part One METROPOLITAN AVE
## 3 2019-10-12 15:12:43 2019    10    Saturday   15   Part One       ALLSTON ST
## 4 2019-10-12 04:41:52 2019    10    Saturday    4   Part One  SAINT JAMES AVE
## 5 2019-10-02 08:08:49 2019    10   Wednesday    8   Part One        N MEAD ST
## 6 2019-09-29 06:39:00 2019     9      Sunday    6 Part Three       WILMORE ST
##        Lat      Long                    Location
## 1 42.25952 -71.12156 (42.25951765, -71.12156299)
## 2 42.26209 -71.11671 (42.26209214, -71.11670964)
## 3 42.35237 -71.13510 (42.35237455, -71.13509584)
## 4 42.34948 -71.07640 (42.34947586, -71.07640150)
## 5 42.38185 -71.06655 (42.38184582, -71.06655134)
## 6 42.27796 -71.09246 (42.27796370, -71.09246318)

Pré-Processamento

# Handling missing values:

# Step 1: turn empty strings ("") into NA in every column.
# Step 2: SHOOTING is converted to character so that its NA entries can be
#         replaced by "N", and is then turned back into a factor.
crime <- crime %>%
  mutate_all(na_if, "") %>%
  mutate(
    SHOOTING = as.character(SHOOTING),
    SHOOTING = as.factor(replace(SHOOTING, is.na(SHOOTING), "N"))
  )


# Checking missing values: count the NA entries in every column.
# A formula lambda replaces the soft-deprecated `funs()` helper (dplyr >= 0.8
# warns about it), and the no-op `select_all` step is dropped.
crime %>%
  summarise_all(~ sum(is.na(.)))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
##   INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION DISTRICT
## 1               0            0                  0                   0     2169
##   REPORTING_AREA SHOOTING OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART
## 1          27253        0                0    0     0           0    0      110
##   STREET   Lat  Long Location
## 1  12391 27204 27204        0
  # Drop every row that still contains at least one NA.
  crime_preproccess <- na.omit(select_all(crime))


# Handling placeholder locations (-1, -1):

# Rows whose latitude or longitude holds the -1 placeholder value.
otl_crime_locations <- filter(
  select_all(crime_preproccess),
  Lat == -1 | Long == -1
)

# Remove those rows from the working data set. No `by` is given, so
# anti_join() matches on all columns the two tables share.
crime_preproccess <- anti_join(x = crime_preproccess, y = otl_crime_locations)
## Joining, by = c("INCIDENT_NUMBER", "OFFENSE_CODE", "OFFENSE_CODE_GROUP",
## "OFFENSE_DESCRIPTION", "DISTRICT", "REPORTING_AREA", "SHOOTING",
## "OCCURRED_ON_DATE", "YEAR", "MONTH", "DAY_OF_WEEK", "HOUR", "UCR_PART",
## "STREET", "Lat", "Long", "Location")

Summary dos Dados Pós-Processamento

summary(crime_preproccess)
##       INCIDENT_NUMBER    OFFENSE_CODE                        OFFENSE_CODE_GROUP
##  I152071596   :    20   Min.   : 111   Motor Vehicle Accident Response: 40374  
##  I172053750   :    18   1st Qu.:1001   Larceny                        : 33569  
##  I192025403   :    15   Median :2907   Medical Assistance             : 31053  
##  I162067346   :    14   Mean   :2303   Investigate Person             : 23679  
##  I182051210   :    14   3rd Qu.:3201   Other                          : 22561  
##  I130041200-00:    13   Max.   :3831   Simple Assault                 : 19985  
##  (Other)      :395609                  (Other)                        :224482  
##                             OFFENSE_DESCRIPTION    DISTRICT      REPORTING_AREA
##  SICK/INJURED/MEDICAL - PERSON        : 24966   B2     : 62808   Min.   :  1   
##  INVESTIGATE PERSON                   : 23679   C11    : 54880   1st Qu.:178   
##  M/V - LEAVING SCENE - PROPERTY DAMAGE: 19026   D4     : 50026   Median :345   
##  VANDALISM                            : 18988   B3     : 45948   Mean   :385   
##  ASSAULT SIMPLE - BATTERY             : 18594   A1     : 41990   3rd Qu.:545   
##  VERBAL DISPUTE                       : 17535   C6     : 28531   Max.   :962   
##  (Other)                              :272915   (Other):111520                 
##  SHOOTING              OCCURRED_ON_DATE       YEAR          MONTH       
##  N:394041   2017-06-01 00:00:00:    32   Min.   :2015   Min.   : 1.000  
##  Y:  1662   2015-07-01 00:00:00:    26   1st Qu.:2016   1st Qu.: 4.000  
##             2016-08-01 00:00:00:    26   Median :2017   Median : 7.000  
##             2015-12-07 11:38:00:    25   Mean   :2017   Mean   : 6.647  
##             2017-08-01 00:00:00:    24   3rd Qu.:2018   3rd Qu.: 9.000  
##             2017-01-01 00:00:00:    23   Max.   :2019   Max.   :12.000  
##             (Other)            :395547                                  
##     DAY_OF_WEEK         HOUR             UCR_PART                 STREET      
##  Friday   :60246   Min.   : 0.00             :     0   WASHINGTON ST : 18766  
##  Monday   :56618   1st Qu.: 9.00   Other     :  1520   BLUE HILL AVE :  9408  
##  Saturday :55610   Median :14.00   Part One  : 76900   BOYLSTON ST   :  9061  
##  Sunday   :50061   Mean   :13.12   Part Three:196676   DORCHESTER AVE:  6495  
##  Thursday :57800   3rd Qu.:18.00   Part Two  :120607   TREMONT ST    :  6410  
##  Tuesday  :57190   Max.   :23.00                       HARRISON AVE  :  6083  
##  Wednesday:58178                                       (Other)       :339480  
##       Lat             Long                               Location     
##  Min.   :42.23   Min.   :-71.18   (42.34862382, -71.08277637):  1671  
##  1st Qu.:42.30   1st Qu.:-71.10   (42.36183857, -71.05976489):  1651  
##  Median :42.33   Median :-71.08   (42.28482577, -71.09137369):  1462  
##  Mean   :42.32   Mean   :-71.08   (42.32866284, -71.08563401):  1337  
##  3rd Qu.:42.35   3rd Qu.:-71.06   (42.25621592, -71.12401947):  1239  
##  Max.   :42.40   Max.   :-71.00   (42.29755533, -71.05970910):  1153  
##                                   (Other)                    :387190

Armazenando o Mapa de Boston no Google Maps

# Fetch a static map centred on the given Boston coordinates (zoom 11) and
# cache the image in a temporary PNG file.
coord_boston <- GetMap(
  center = c(lat = 42.36025, lon = -71.05829),
  zoom = 11,
  type = "google-m",
  destfile = tempfile("boston_map", fileext = ".png")
)

Plotando o Mapa da Cidade

# Draw the downloaded base map and keep the returned map object.
boston_map <- PlotOnStaticMap(MyMap = coord_boston)

Ocorrências dos Crimes Distribuídos no Mapa

# Redraw the base map, then overlay every crime location as a red point.
# `add = TRUE` is spelled out: `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
boston_map <- PlotOnStaticMap(coord_boston)
crime_occ_map <- PlotOnStaticMap(
  boston_map,
  lon = crime_preproccess$Long,
  lat = crime_preproccess$Lat,
  destfile = "crime_occ_map.png",
  FUN = points,
  col = "red",
  add = TRUE
)

Colocando o Contorno dos Bairros

# Read the neighbourhood boundaries shapefile.
shp_nB <- shapefile(
  x = "C:/Users/felip/Desktop/Cursos/Kaggle/bostonCrimes_kgl/Boston_Neighborhoods.shp"
)

# Attribute table of the shapefile as a plain data frame.
df_shp_nB <- as.data.frame(shp_nB)

# Reproject to longitude/latitude on the WGS84 datum so the polygons line up
# with the Google Maps base map.
crs <- CRS("+proj=longlat +datum=WGS84")
shp_nB <- spTransform(shp_nB, CRSobj = crs)

# Rebuild the object as a bare 'SpatialPolygons' from its polygon list, which
# is the polygon format that 'PlotPolysOnStaticMap' accepts.
shp_nB <- SpatialPolygons(Srl = slot(shp_nB, "polygons"))
  
# Adding the neighbourhood polygons to the map: redraw the base map, overlay
# the crime locations as red points, then draw the polygon outlines on top.
# `add = TRUE` is spelled out: `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
boston_map <- PlotOnStaticMap(coord_boston)
crime_occ_map <- PlotOnStaticMap(
  boston_map,
  lon = crime_preproccess$Long,
  lat = crime_preproccess$Lat,
  destfile = "crime_occ_map.png",
  FUN = points,
  col = "red",
  add = TRUE
)
PlotPolysOnStaticMap(MyMap = crime_occ_map, polys = shp_nB, add = TRUE)

Crimes Existentes

# List the distinct offense groups (the levels of OFFENSE_CODE_GROUP).
levels(sort(crime_preproccess$OFFENSE_CODE_GROUP))
##  [1] "Aggravated Assault"                       
##  [2] "Aircraft"                                 
##  [3] "Arson"                                    
##  [4] "Assembly or Gathering Violations"         
##  [5] "Auto Theft"                               
##  [6] "Auto Theft Recovery"                      
##  [7] "Ballistics"                               
##  [8] "Biological Threat"                        
##  [9] "Bomb Hoax"                                
## [10] "Burglary - No Property Taken"             
## [11] "Commercial Burglary"                      
## [12] "Confidence Games"                         
## [13] "Counterfeiting"                           
## [14] "Criminal Harassment"                      
## [15] "Disorderly Conduct"                       
## [16] "Drug Violation"                           
## [17] "Embezzlement"                             
## [18] "Evading Fare"                             
## [19] "Explosives"                               
## [20] "Fire Related Reports"                     
## [21] "Firearm Discovery"                        
## [22] "Firearm Violations"                       
## [23] "Fraud"                                    
## [24] "Gambling"                                 
## [25] "Harassment"                               
## [26] "Harbor Related Incidents"                 
## [27] "HOME INVASION"                            
## [28] "Homicide"                                 
## [29] "HUMAN TRAFFICKING"                        
## [30] "HUMAN TRAFFICKING - INVOLUNTARY SERVITUDE"
## [31] "Investigate Person"                       
## [32] "INVESTIGATE PERSON"                       
## [33] "Investigate Property"                     
## [34] "Landlord/Tenant Disputes"                 
## [35] "Larceny"                                  
## [36] "Larceny From Motor Vehicle"               
## [37] "License Plate Related Incidents"          
## [38] "License Violation"                        
## [39] "Liquor Violation"                         
## [40] "Manslaughter"                             
## [41] "Medical Assistance"                       
## [42] "Missing Person Located"                   
## [43] "Missing Person Reported"                  
## [44] "Motor Vehicle Accident Response"          
## [45] "Offenses Against Child / Family"          
## [46] "Operating Under the Influence"            
## [47] "Other"                                    
## [48] "Other Burglary"                           
## [49] "Phone Call Complaints"                    
## [50] "Police Service Incidents"                 
## [51] "Prisoner Related Incidents"               
## [52] "Property Found"                           
## [53] "Property Lost"                            
## [54] "Property Related Damage"                  
## [55] "Prostitution"                             
## [56] "Recovered Stolen Property"                
## [57] "Residential Burglary"                     
## [58] "Restraining Order Violations"             
## [59] "Robbery"                                  
## [60] "Search Warrants"                          
## [61] "Service"                                  
## [62] "Simple Assault"                           
## [63] "Towed"                                    
## [64] "Vandalism"                                
## [65] "Verbal Disputes"                          
## [66] "Violations"                               
## [67] "Warrant Arrests"

Encoding dos Crimes

# Encode the values of `x` as their 1-based position in `order`.
#
# x:     vector of values to encode.
# order: vector giving the desired ordering of the distinct values; defaults
#        to the values of `x` in order of first appearance.
#
# Returns a numeric vector the same length as `x`. Values not present in
# `order` become NA; an NA in `x` is encoded like any other value when NA is
# part of `order` (which it is under the default).
encode_ordinal <- function(x, order = unique(x)) {
  ranked <- factor(x, levels = order, exclude = NULL)
  as.numeric(ranked)
}


# Ordinal encoding of the 'OFFENSE_CODE_GROUP' feature: each record receives
# the position of its offense group within the column's level order.
crime_preproccess$OFFENSE_SORT_ENCODED <- encode_ordinal(
  x = crime_preproccess$OFFENSE_CODE_GROUP,
  order = levels(sort(crime_preproccess$OFFENSE_CODE_GROUP))
)

Barplot - Crimes

# Bar chart of the number of records per encoded offense group, with the
# count printed above each bar.
ggplot(crime_preproccess, aes(x = OFFENSE_SORT_ENCODED)) +
  geom_bar(aes(y = ..count..), fill = "blue", width = 0.5, position = "dodge") +
  geom_text(aes(label = ..count..), stat = "count", size = 3, vjust = -1) +
  scale_x_discrete(limits = 1:67) +
  labs(
    x = "Código dos Crimes",
    y = "Frequência Absoluta",
    title = "Tipos de Ocorrências",
    subtitle = "Crimes cometidos na cidade de Boston-MA: 
Junho/2015 a Outubro/2019 (Fonte: data.boston.gov)"
  ) +
  theme_classic()

Pieplot - Envolvimento c/ Arma de Fogo

# Number of records flagged 'N' and 'Y' in the SHOOTING column.
n <- sum(crime_preproccess[["SHOOTING"]] == "N")
y <- sum(crime_preproccess[["SHOOTING"]] == "Y")

shoot_perc <- c(n, y)

# Slice labels: share of each flag as a percentage with two decimals.
piepercent <- paste0(round(100 * shoot_perc / sum(shoot_perc), 2), "%")

# Pie chart showing whether the recorded occurrences involved a shooting.
pie(
  shoot_perc,
  labels = piepercent,
  col = c("darkgrey", "white"),
  border = "black",
  main = "Porcentagem de Crimes\nEnvolvimento com Arma de Fogo\nBoston-MA Crimes"
)
legend(
  "bottomright",
  legend = c("No", "Yes"),
  fill = c("darkgrey", "white"),
  cex = 0.9
)
text(
  x = 0,
  y = 1,
  labels = "Junho/2015 a Outubro/2019 (Fonte: data.boston.gov)",
  col = "black"
)

PreProcess - Time Series

# Create the 'DATE' column used by the time-series plots.
#
# OCCURRED_ON_DATE is converted to text, parsed as a "%Y-%m-%d %H:%M:%S"
# timestamp, formatted back to its "%Y-%m-%d" part, and finally stored as a
# Date.
crime_preproccess$DATE <- as.Date.character(
  format(
    strptime(
      as.character(crime_preproccess$OCCURRED_ON_DATE),
      format = "%Y-%m-%d %H:%M:%S"
    ),
    "%Y-%m-%d"
  )
)

Time Series - Ocorrências por Mês/Ano

# Line chart of the number of records per month, one panel per year, with the
# count printed above each point. Counts outside the 4000-10000 y range are
# not drawn (ggplot2 warns about the removed rows).
ggplot(crime_preproccess, aes(x = MONTH)) +
  geom_line(stat = "count", colour = "darkblue", size = 0.5) +
  geom_text(aes(label = ..count..), stat = "count", size = 3, vjust = -1) +
  facet_grid(YEAR ~ .) +
  scale_x_continuous(breaks = 1:12, labels = 1:12) +
  scale_y_continuous(
    limits = c(4000, 10000),
    breaks = seq(4000, 10000, by = 2000)
  ) +
  labs(
    x = "Mês",
    y = "N° Total de Crimes",
    title = "Número de Crimes por Mes-Ano",
    subtitle = "Boston-MA: Junho/2015 a Setembro/2019 (Fonte: data.boston.gov)"
  ) +
  theme_minimal()
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_text).

      ### Como a pesquisa termina no início de Outubro/2019, esse mês não foi incluído no gráfico por não apresentar dados relativos ao mês inteiro.

Time Series - Ocorrências por Hora/Ano

# Line chart of the number of records per hour of the day, one panel per
# year, with the count printed above each point.
ggplot(crime_preproccess, aes(x = HOUR)) +
    geom_line(stat = "count", colour = "darkgrey", size = 1) +
    geom_text(aes(label = ..count..), stat = "count", size = 3, vjust = -1) +
    facet_grid(YEAR ~ .) +
    scale_x_continuous(breaks = 0:23, labels = 0:23) +
    scale_y_continuous(limits = c(0, 7500), breaks = seq(0, 7500, by = 2500)) +
    labs(
         x = "Hora",
         y = "N° Total de Crimes",
         title = "Número de Crimes por Hora-Ano",
         subtitle = "Boston-MA: Junho/2015 a Setembro/2019 
         (Fonte: data.boston.gov)") +
    theme_minimal()

Heat Map - PreProcess

# Use 'fortify' to turn the neighbourhood polygons into a data frame holding
# the coordinates of the polygon vertices.
shp_nB.fort <- fortify(shp_nB)

[1] Heat Map - Crimes em Geral

# Neighbourhood outlines (white fill, black border) with a 2D density of all
# crime locations drawn on top as semi-transparent filled contours.
ggplot(shp_nB.fort, aes(x = long, y = lat, group = group)) +
    geom_polygon(fill = "white", colour = "black") +
    stat_density2d(
        aes(x = Long, y = Lat, fill = ..level..),
        data = crime_preproccess,
        geom = "polygon",
        alpha = 0.5,
        inherit.aes = FALSE
    ) +
    scale_fill_distiller(palette = "Spectral") +
    theme_minimal()

[2] Heat Map - Crimes por Ano